Comparison with Gandal dataset




library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(biomaRt) ; library(DESeq2)
library(Rtsne)
library(ClusterR)
library(knitr)

Load preprocessed dataset (preprocessing code in 20_03_30_data_preprocessing.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame

# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)

# Update DE_info with Neuronal information
DE_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(GO_neuronal, by='ID') %>%
          mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
          mutate(significant=padj<0.05 & !is.na(padj))

rm(GO_annotations)


All samples together

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
              scale_x_log10() + theme_minimal())

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
              theme_minimal() + ggtitle('Mean expression density by Gene (left) and by Sample (right)'))

subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)

Grouping genes by Neuronal tag and samples by Diagnosis

  • The genes with Neuronal function have a higher expression distribution than the ones without it

  • The ASD group has higher levels of expression than the Control group

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr)) %>% 
            left_join(GO_neuronal, by='ID') %>% mutate('Neuronal'=ifelse(is.na(Neuronal),F,T))
p1 = plot_data %>% ggplot(aes(Mean, color=Neuronal, fill=Neuronal)) + geom_density(alpha=0.3) +
                   scale_x_log10() + theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression density by gene')

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% 
            left_join(datMeta, by=c('ID'='ID'))
p2 = plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + geom_density(alpha=0.3) +
                   theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression density by Sample')

grid.arrange(p1, p2, nrow=1)

rm(GO_annotations, plot_data, p1, p2)


ASD vs CTL

In general there doesn’t seem to be a lot of variance in mean expression between autism and control samples by gene

plot_data = data.frame('ID'=rownames(datExpr),
                       'ASD'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']),
                       'CTL'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD'])) %>%
                       mutate(diff=ASD-CTL, abs_diff = abs(ASD-CTL)) %>%
                       mutate(std_diff = (diff-mean(diff))/sd(diff), distance = abs((diff-mean(diff))/sd(diff)))

plot_data %>% ggplot(aes(ASD, CTL, color = distance)) + geom_point(alpha = plot_data$abs_diff) + geom_abline(color = 'gray') +
              scale_color_viridis(direction = -1) + ggtitle('Mean expression ASD vs CTL') + theme_minimal() + coord_fixed()

summary(plot_data$std_diff)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -4.39123 -0.63561 -0.01915  0.00000  0.60240 11.13999
cat(paste0('There are ', sum(plot_data$distance>3), ' genes with a difference between Diagnoses larger than 3 SD to ',
           'the distance distribution of all genes'))
## There are 110 genes with a difference between Diagnoses larger than 3 SD to the distance distribution of all genes
#cat(paste0('Outlier genes: ', paste(plot_data$ID[abs(plot_data$std_diff)>3], collapse = ', ')))


Grouping genes and samples by Diagnosis

  • There doesn’t seem to be a noticeable difference between mean expression by gene between Diagnosis groups

  • Samples with autism tend to have a narrower higher level of expression than the control group (as we had already seen above)

plot_data = rbind(data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + scale_x_log10() + theme_minimal())

plot_data = rbind(data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + theme_minimal() +
              ggtitle('Mean expression by Gene (left) and by Sample (right) grouped by Diagnosis'))

subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)




Visualisations


Samples

PCA

The first principal component seems to separate relatively well the two diagnosis

pca = datExpr %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% 
            left_join(datMeta, by=c('ID'='ID')) %>% 
            dplyr::select('ID','PC1','PC2','Diagnosis') %>% 
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))

plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point() + theme_minimal() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)


MDS

Looks exactly the same as the PCA visualisation, just inverting the x axis

d = datExpr %>% t %>% dist
fit = cmdscale(d, k=2)

plot_data = data.frame('ID'=colnames(datExpr), 'C1'=fit[,1], 'C2'=fit[,2]) %>%
            left_join(datMeta, by=c('ID'='ID')) %>% 
            dplyr::select('C1','C2','Diagnosis') %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))

plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point() + theme_minimal() + ggtitle('MDS')

rm(d, fit, plot_data)


t-SNE

Higher perplexities seem to capture the difference between diagnosis better, although at the end they seem to be capturing another pattern as well, since the samples seem to be grouped in pairs

perplexities = c(2,5,10,30)
ps = list()

for(i in 1:length(perplexities)){
  set.seed(123)
  tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
  plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
              left_join(datMeta, by=c('ID'='ID')) %>%
              dplyr::select('C1','C2','Diagnosis','Subject_ID') %>%
              mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point() + theme_minimal() +
            ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)

rm(ps, perplexities, tsne, i)


In the Gandal dataset, the higher perplexity values managed to capture the subject the samples belonged to, but it doesn’t seem to do it with this new dataset

ggplotly(plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(aes(id=Subject_ID)) + theme_minimal() + 
         theme(legend.position='none') + ggtitle('t-SNE Perplexity=30 coloured by Subject ID'))
rm(plot_data)

Genes

PCA

  • First Principal Component explains over 96% of the total variance (Less than Gandal’s)

  • There’s a really strong correlation between the mean expression of a gene and the 1st principal component

  • There is one big outlier in the 2nd principal component

pca = datExpr %>% prcomp

plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() + 
              scale_color_viridis() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)

t-SNE

Higher perplexities capture a cleaner visualisation of the data ordered by mean expression, in a similar (although not as linear) way to PCA

perplexities = c(1,2,5,10,50,100)
ps = list()

for(i in 1:length(perplexities)){
  tsne = read.csv(paste0('./../Visualisations/tsne_perplexity_',perplexities[i],'.csv'))
  plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() +
            scale_color_viridis() + ggtitle(paste0('Perplexity=',perplexities[i])) + 
            theme(legend.position='none') + coord_fixed()
}

grid.arrange(grobs=ps, nrow=2)

rm(perplexities, ps, tsne, i)


Differential Expression Analysis

  • Only 944 genes (~7% vs ~26% in Gandal’s dataset) are significant using a threshold of 0.05 for the adjusted p-value and a without a log Fold Change threshold (keeping the null hypothesis \(H_0: lfc=0\))

  • Many genes weren’t assigned an adjusted p-value

table(DE_info$padj<0.05, useNA='ifany')
## 
## FALSE  TRUE  <NA> 
## 10629   944  2125
p = DE_info %>% ggplot(aes(log2FoldChange, padj, color=significant)) + geom_point(alpha=0.2) + 
    scale_y_sqrt() + xlab('log2 Fold Change') + ylab('Adjusted p-value') + theme_minimal()
ggExtra::ggMarginal(p, type = 'density', color='gray', fill='gray', size=10)

rm(p)
  • There is a clear negative relation between lfc and mean expression, for both differentially expressed and not differentially expressed groups of genes

  • The relation is strongest for genes with low levels of expression

plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% left_join(DE_info, by='ID') %>%
            mutate('statisticallySignificant' = ifelse(is.na(padj),NA, ifelse(padj<0.05, TRUE, FALSE)),
                   'alpha' = ifelse(padj>0.05 | is.na(padj), 0.1, 0.5))

plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color=statisticallySignificant)) + 
              geom_point(alpha = plot_data$alpha) + geom_smooth(method='lm') + 
              theme_minimal() + scale_y_sqrt() + theme(legend.position = 'bottom') +
              xlab('Mean Expression') + ylab('abs(lfc)') + ggtitle('Log fold change by level of expression')

  • When filtering for differential expression, the points separate into two clouds depending on whether they are over or underexpressed

  • The top cloud corresponds to the under expressed genes and the bottom to the over expressed ones

datExpr_DE = datExpr[DE_info$significant,]

pca = datExpr_DE %>% prcomp

plot_data = cbind(data.frame('PC1'=pca$x[,1], 'PC2'=pca$x[,2]), DE_info[DE_info$significant==TRUE,])

pos_zero = -min(plot_data$log2FoldChange)/(max(plot_data$log2FoldChange)-min(plot_data$log2FoldChange))
p = plot_data %>% ggplot(aes(PC1, PC2, color=log2FoldChange)) + geom_point(alpha=0.5) +
                  scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'), 
                                        values=c(0, pos_zero-0.1, pos_zero, pos_zero+0.1, 1)) +
                  theme_minimal() + ggtitle('
PCA of differentially expressed genes') + # This is on purpose, PDF doesn't save well without this white space (?)
                  xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
                  ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) + 
                  theme(legend.position = 'bottom')
ggExtra::ggMarginal(p, type='density', color='gray', fill='gray', size=10)

rm(pos_zero, p)

Separating the genes into these two groups: Salmon: under-expressed, aqua: over-expressed

plot_data = plot_data %>% mutate('Group'=ifelse(log2FoldChange>0,'overexpressed','underexpressed')) %>%
            mutate('Group' = factor(Group, levels=c('underexpressed','overexpressed')))
plot_data %>% ggplot(aes(PC1, PC2, color=Group)) + geom_point(alpha=0.4) + 
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              theme_minimal() + ggtitle('PCA')

rm(pca)

List of top DE genes

  • The genes with the highest LFC are overexpressed genes and most none has a Neuronal tag
# Get genes names
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=plot_data$ID, mart=mart) %>% 
             rename(external_gene_id = 'gene_name', ensembl_gene_id = 'ID')
top_genes = plot_data %>% left_join(gene_names, by='ID') %>% arrange(-abs(log2FoldChange)) %>% 
            top_n(50, wt=log2FoldChange)

kable(top_genes %>% dplyr::select(ID, gene_name, log2FoldChange, padj, Neuronal, Group))
ID gene_name log2FoldChange padj Neuronal Group
ENSG00000256618 MTRNR2L1 1.9916981 0.0401708 0 overexpressed
ENSG00000137959 IFI44L 1.8535885 0.0000893 0 overexpressed
ENSG00000126709 IFI6 1.5571198 0.0000035 0 overexpressed
ENSG00000224982 TMEM233 1.3304682 0.0048144 0 overexpressed
ENSG00000186470 BTN3A2 1.1915207 0.0003132 0 overexpressed
ENSG00000150337 FCGR1A 1.1902867 0.0359492 0 overexpressed
ENSG00000196126 HLA-DRB1 1.1625629 0.0068988 0 overexpressed
ENSG00000090920 FCGBP 1.1389915 0.0092884 0 overexpressed
ENSG00000188536 HBA2 1.0685930 0.0354139 0 overexpressed
ENSG00000206172 HBA1 1.0550535 0.0048144 0 overexpressed
ENSG00000187608 ISG15 1.0270437 0.0057662 0 overexpressed
ENSG00000167754 KLK5 1.0203547 0.0250979 0 overexpressed
ENSG00000185955 C7orf61 0.9949872 0.0339392 0 overexpressed
ENSG00000130303 BST2 0.9830483 0.0003724 0 overexpressed
ENSG00000133048 CHI3L1 0.9654080 0.0259421 0 overexpressed
ENSG00000119917 IFIT3 0.9623987 0.0033546 0 overexpressed
ENSG00000133321 RARRES3 0.9537325 0.0001111 0 overexpressed
ENSG00000107201 DDX58 0.9531294 0.0014643 0 overexpressed
ENSG00000181019 NQO1 0.9390997 0.0036609 1 overexpressed
ENSG00000197747 S100A10 0.9342678 0.0021598 0 overexpressed
ENSG00000033867 SLC4A7 0.9330656 0.0000464 0 overexpressed
ENSG00000159403 C1R 0.9329289 0.0000893 0 overexpressed
ENSG00000122641 INHBA 0.9115749 0.0436385 0 overexpressed
ENSG00000132274 TRIM22 0.9085575 0.0205875 0 overexpressed
ENSG00000140961 OSGIN1 0.9041925 0.0194605 0 overexpressed
ENSG00000105559 PLEKHA4 0.9027897 0.0012494 0 overexpressed
ENSG00000084093 REST 0.8858131 0.0037218 1 overexpressed
ENSG00000181722 ZBTB20 0.8772604 0.0011067 0 overexpressed
ENSG00000182326 C1S 0.8729420 0.0022401 0 overexpressed
ENSG00000028277 POU2F2 0.8627281 0.0018340 0 overexpressed
ENSG00000054179 ENTPD2 0.8498669 0.0097728 0 overexpressed
ENSG00000152952 PLOD2 0.8421405 0.0332803 0 overexpressed
ENSG00000025708 TYMP 0.8388151 0.0179278 0 overexpressed
ENSG00000186818 LILRB4 0.8384667 0.0273850 0 overexpressed
ENSG00000163840 DTX3L 0.8378994 0.0208838 0 overexpressed
ENSG00000102265 TIMP1 0.8374703 0.0016570 0 overexpressed
ENSG00000077264 PAK3 0.8177574 0.0003130 0 overexpressed
ENSG00000185885 IFITM1 0.8128928 0.0133938 0 overexpressed
ENSG00000068079 IFI35 0.8037349 0.0025741 0 overexpressed
ENSG00000067082 KLF6 0.7803468 0.0000603 0 overexpressed
ENSG00000070778 PTPN21 0.7677702 0.0110275 0 overexpressed
ENSG00000155363 MOV10 0.7673794 0.0068575 0 overexpressed
ENSG00000157150 TIMP4 0.7649290 0.0183559 0 overexpressed
ENSG00000173369 C1QB 0.7570775 0.0318371 0 overexpressed
ENSG00000147113 CXorf36 0.7558034 0.0173219 0 overexpressed
ENSG00000115084 SLC35F5 0.7553919 0.0200686 0 overexpressed
ENSG00000102805 CLN5 0.7515606 0.0040051 1 overexpressed
ENSG00000167414 GNG8 0.7481608 0.0460567 0 overexpressed
ENSG00000002933 TMEM176A 0.7431177 0.0177139 0 overexpressed
ENSG00000117620 SLC35A3 0.7401913 0.0153041 0 overexpressed
rm(top_genes)

Plotting the mean expression by group in Gandal’s dataset there seemed to exist underlying distributions, so we would use GMM to separate them, but everything seems very homogeneous here, so this doesn’t seem to be necessary. (If we do it anyway we can see that they still cluster by mean expression, which makes sense since it explains the majority of the variance of the genes)

Under-expressed ASD genes tend to have a higher mean expression than over-expressed ones (same as with Gandal’s data)

gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

tot_n_clusters = 4

plot_data = plot_data %>% mutate('MeanExpr'=rowMeans(datExpr_DE), 'SDExpr'=apply(datExpr_DE,1,sd))

GMM_G1 = plot_data %>% filter(Group=='overexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)
GMM_G2 = plot_data %>% filter(Group=='underexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)

memberships_G1 = data.frame('ID'=plot_data$ID[plot_data$Group=='overexpressed'],
                            'Membership'=GMM_G1$Log_likelihood %>% 
                            apply(1, function(x) glue('over_', which.max(x))))
memberships_G2 = data.frame('ID'=plot_data$ID[plot_data$Group=='underexpressed'],
                            'Membership'=GMM_G2$Log_likelihood %>%
                            apply(1, function(x) glue('under_', which.max(x))))

plot_data = rbind(memberships_G1, memberships_G2) %>% left_join(plot_data, by='ID')

p1 = plot_data %>% ggplot(aes(x=MeanExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + 
     theme_minimal() + theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(x=MeanExpr)) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[1],
                   args=list(mean=GMM_G1$centroids[1], sd=GMM_G1$covariance_matrices[1])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[2],
                   args=list(mean=GMM_G1$centroids[2], sd=GMM_G1$covariance_matrices[2])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[3],
                   args=list(mean=GMM_G2$centroids[1], sd=GMM_G2$covariance_matrices[1])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[4],
                  args=list(mean=GMM_G2$centroids[2], sd=GMM_G2$covariance_matrices[2])) +
     theme_minimal()

p3 = plot_data %>% ggplot(aes(PC1, PC2, color=Membership)) + geom_point(alpha=0.4) + theme_minimal() + 
     theme(legend.position='bottom')

grid.arrange(p1, p2, p3, nrow=1)

rm(gg_colour_hue, GMM_G1, GMM_G2, memberships_G1, memberships_G2, p1, p2, p3, tot_n_clusters)

For previous preprocessing pipelines, the pattern found above was also present in the standard deviation, but there doesn’t seem to be any distinguishable patterns now. This could be because the variance was almost homogenised with the vst normalisation algorithm.

plot_data %>% ggplot(aes(x=SDExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + theme_minimal()

rm(plot_data)





Effects of modifying the log fold change threshold

fc_list = seq(1, 1.1, 0.003)

n_genes = nrow(datExpr)

# Calculate PCAs
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr %>% data.frame %>% prcomp(scale.=TRUE)

# Initialise DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=colnames(datExpr), 'fc'=0, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=rownames(datExpr), 'fc'=0, PC1=scale(PC1), PC2=scale(PC2))

pca_samps_old = pcas_samps
pca_genes_old = pcas_genes

for(fc in fc_list){
  
  # Recalculate DE_info with the new threshold (p-values change) an filter DE genes
  DE_genes = results(dds, lfcThreshold=log2(fc), altHypothesis='greaterAbs') %>% data.frame %>%
             mutate('ID'=rownames(.)) %>% filter(padj<0.05)
  
  datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
  n_genes = c(n_genes, nrow(DE_genes))
  
  # Calculate PCAs
  datExpr_pca_samps = datExpr_DE %>% t %>% prcomp(scale.=TRUE)
  datExpr_pca_genes = datExpr_DE %>% prcomp(scale.=TRUE)

  # Create new DF entries
  pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=colnames(datExpr), 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))
  pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=DE_genes$ID, 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))  
  
  # Change PC sign if necessary
  if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
  if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
  if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC1,
         pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1)<0){
    pca_genes_new$PC1 = -pca_genes_new$PC1
  }
  if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC2, 
         pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
    pca_genes_new$PC2 = -pca_genes_new$PC2
  }
  
  pca_samps_old = pca_samps_new
  pca_genes_old = pca_genes_new
  
  # Update DFs
  pcas_samps = rbind(pcas_samps, pca_samps_new)
  pcas_genes = rbind(pcas_genes, pca_genes_new)
  
}

# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% left_join(datMeta, by=c('ID')) %>%
             dplyr::select(ID, PC1, PC2, fc, Diagnosis, brain_lobe)
# pcas_genes = pcas_genes %>% left_join(SFARI_genes, by='ID') %>% 
#              mutate('score'=as.factor(`gene-score`)) %>%
#              dplyr::select(ID, PC1, PC2, lfc, score)

# Plot change of number of genes
ggplotly(data.frame('fc'=fc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=fc, y=n_genes)) + 
         geom_point() + geom_line() + theme_minimal() + xlab('Fold Change') +
         ggtitle('Number of remaining genes when modifying filtering threshold'))
rm(fc_list, n_genes, fc, pca_samps_new, pca_genes_new, pca_samps_old, pca_genes_old, 
   datExpr_pca_samps, datExpr_pca_genes)


Samples

Note: PC values get smaller as Log2 fold change increases, so on each iteration the values were scaled so it would be easier to compare between frames

Coloured by Diagnosis:

The lfc threshold doesn’t seem to make a big difference for differentiating genes by Diagnosis

ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=fc, ids=ID)) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))


Coloured by brain region:

There doesn’t seem to be any strong recognisable pattern, perhaps the samples from the Occipital lobe have lower PC1 values than the ones from the Frontal lobe

ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=brain_lobe)) + geom_point(aes(frame=fc, ids=ID)) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))


Genes

if(!'fcSign' %in% colnames(pcas_genes)){
  pcas_genes = pcas_genes %>% left_join(DE_info, by='ID') %>% mutate(fcSign = ifelse(log2FoldChange>0,'Over-expressed','Under-expressed')) 
}

ggplotly(pcas_genes %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),3))) %>% 
         ggplot(aes(PC1, PC2, color=fcSign)) + geom_point(aes(frame=abs_lfc, ids=ID, alpha=0.1)) + 
         theme_minimal() + ggtitle('Genes PCA plot modifying |LFC| filtering threshold'))




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.28                  ClusterR_1.2.1             
##  [3] gtools_3.8.2                Rtsne_0.15                 
##  [5] DESeq2_1.24.0               SummarizedExperiment_1.14.1
##  [7] DelayedArray_0.10.0         BiocParallel_1.18.1        
##  [9] matrixStats_0.56.0          Biobase_2.44.0             
## [11] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [13] IRanges_2.18.3              S4Vectors_0.22.1           
## [15] BiocGenerics_0.30.0         biomaRt_2.40.5             
## [17] GGally_1.5.0                gridExtra_2.3              
## [19] viridis_0.5.1               viridisLite_0.3.0          
## [21] RColorBrewer_1.1-2          plotlyutils_0.0.0.9000     
## [23] plotly_4.9.2                glue_1.3.2                 
## [25] reshape2_1.4.3              forcats_0.5.0              
## [27] stringr_1.4.0               dplyr_0.8.5                
## [29] purrr_0.3.3                 readr_1.3.1                
## [31] tidyr_1.0.2                 tibble_3.0.0               
## [33] ggplot2_3.3.0               tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1       ellipsis_0.3.0         htmlTable_1.13.3      
##   [4] XVector_0.24.0         base64enc_0.1-3        fs_1.4.0              
##   [7] rstudioapi_0.11        farver_2.0.3           bit64_0.9-7           
##  [10] AnnotationDbi_1.46.1   fansi_0.4.1            lubridate_1.7.4       
##  [13] xml2_1.2.5             splines_3.6.3          geneplotter_1.62.0    
##  [16] Formula_1.2-3          jsonlite_1.6.1         annotate_1.62.0       
##  [19] broom_0.5.5            cluster_2.1.0          dbplyr_1.4.2          
##  [22] png_0.1-7              shiny_1.4.0.2          compiler_3.6.3        
##  [25] httr_1.4.1             backports_1.1.5        fastmap_1.0.1         
##  [28] assertthat_0.2.1       Matrix_1.2-18          lazyeval_0.2.2        
##  [31] cli_2.0.2              later_1.0.0            acepack_1.4.1         
##  [34] htmltools_0.4.0        prettyunits_1.1.1      tools_3.6.3           
##  [37] gmp_0.5-13.6           gtable_0.3.0           GenomeInfoDbData_1.2.1
##  [40] Rcpp_1.0.4             cellranger_1.1.0       vctrs_0.2.4           
##  [43] nlme_3.1-144           crosstalk_1.1.0.1      xfun_0.12             
##  [46] rvest_0.3.5            mime_0.9               miniUI_0.1.1.1        
##  [49] lifecycle_0.2.0        XML_3.99-0.3           zlibbioc_1.30.0       
##  [52] scales_1.1.0           promises_1.1.0         hms_0.5.3             
##  [55] curl_4.3               yaml_2.2.1             memoise_1.1.0         
##  [58] rpart_4.1-15           ggExtra_0.9            reshape_0.8.8         
##  [61] latticeExtra_0.6-29    stringi_1.4.6          RSQLite_2.2.0         
##  [64] highr_0.8              genefilter_1.66.0      checkmate_2.0.0       
##  [67] rlang_0.4.5            pkgconfig_2.0.3        bitops_1.0-6          
##  [70] evaluate_0.14          lattice_0.20-40        labeling_0.3          
##  [73] htmlwidgets_1.5.1      bit_1.1-15.2           tidyselect_1.0.0      
##  [76] plyr_1.8.6             magrittr_1.5           R6_2.4.1              
##  [79] generics_0.0.2         Hmisc_4.4-0            DBI_1.1.0             
##  [82] mgcv_1.8-31            pillar_1.4.3           haven_2.2.0           
##  [85] foreign_0.8-75         withr_2.1.2            survival_3.1-11       
##  [88] RCurl_1.98-1.1         nnet_7.3-13            modelr_0.1.6          
##  [91] crayon_1.3.4           rmarkdown_2.1          jpeg_0.1-8.1          
##  [94] progress_1.2.2         locfit_1.5-9.4         grid_3.6.3            
##  [97] readxl_1.3.1           data.table_1.12.8      blob_1.2.1            
## [100] reprex_0.3.0           digest_0.6.25          xtable_1.8-4          
## [103] httpuv_1.5.2           munsell_0.5.0